{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11c4bd54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\u001b[33m\n",
      "\u001b[0mCollecting git+https://github.com/boudinfl/pke.git\n",
      "  Cloning https://github.com/boudinfl/pke.git to /private/var/folders/gf/q45ftd6d5z7c8tcytwj5czh80000gp/T/pip-req-build-sg_8n42x\n",
      "  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /private/var/folders/gf/q45ftd6d5z7c8tcytwj5czh80000gp/T/pip-req-build-sg_8n42x\n"
     ]
    }
   ],
   "source": [
    "!pip install git+https://github.com/boudinfl/pke.git\n",
    "!pip install datasets\n",
    "!python -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fb58a4a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "from spacy.tokenizer import _get_regex_pattern\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "# Tokenization fix for in-word hyphens (e.g. 'non-linear' would be kept \n",
    "# as one token instead of default spacy behavior of 'non', '-', 'linear')\n",
    "# https://spacy.io/usage/linguistic-features#native-tokenizer-additions\n",
    "\n",
    "from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER\n",
    "from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS\n",
    "from spacy.util import compile_infix_regex\n",
    "\n",
    "# Modify tokenizer infix patterns\n",
    "infixes = (\n",
    "    LIST_ELLIPSES\n",
    "    + LIST_ICONS\n",
    "    + [\n",
    "        r\"(?<=[0-9])[+\\-\\*^](?=[0-9-])\",\n",
    "        r\"(?<=[{al}{q}])\\.(?=[{au}{q}])\".format(\n",
    "            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES\n",
    "        ),\n",
    "        r\"(?<=[{a}]),(?=[{a}])\".format(a=ALPHA),\n",
    "        # ✅ Commented out regex that splits on hyphens between letters:\n",
    "        # r\"(?<=[{a}])(?:{h})(?=[{a}])\".format(a=ALPHA, h=HYPHENS),\n",
    "        r\"(?<=[{a}0-9])[:<>=/](?=[{a}])\".format(a=ALPHA),\n",
    "    ]\n",
    ")\n",
    "\n",
    "infix_re = compile_infix_regex(infixes)\n",
    "nlp.tokenizer.infix_finditer = infix_re.finditer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7f9e3c81",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:datasets.builder:No config specified, defaulting to: sem_eval/raw\n",
      "WARNING:datasets.builder:Reusing dataset sem_eval (/Users/boudin-f/.cache/huggingface/datasets/taln-ls2n___sem_eval/raw/1.0.0/b40e008b5c96137733e24d9d244d70aa1fe6353ee65e180d8f6948af4027fbe4)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1978f219b4e34da7b340e8e11f582192",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b6c15c3658244d39ad6839da16edbe9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/144 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "81dfef83378544879fa86c6fef2619eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tqdm.notebook import tqdm\n",
    "from datasets import load_dataset\n",
    "\n",
    "benchmark = \"semeval-2010-pre\"\n",
    "# load the inspec dataset\n",
    "dataset = load_dataset('taln-ls2n/'+benchmark)\n",
    "\n",
    "# pre-process training and test splits\n",
    "train = []\n",
    "for sample in tqdm(dataset['train']):\n",
    "    train.append(nlp(sample[\"title\"]+\". \"+sample[\"abstract\"]))\n",
    "test = []\n",
    "for sample in tqdm(dataset['test']):\n",
    "    test.append(nlp(sample[\"title\"]+\". \"+sample[\"abstract\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "42e727ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pke import compute_document_frequency, compute_lda_model\n",
    "from string import punctuation\n",
    "\n",
    "\n",
    "# computing LDA and DF counts\n",
    "\n",
    "compute_document_frequency(\n",
    "    documents=train,\n",
    "    output_file='data/{}.df.gz'.format(benchmark),\n",
    "    language='en',              # language of the input files\n",
    "    normalization='stemming',   # use porter stemmer\n",
    "    stoplist=list(punctuation), # stoplist (punctuation marks)\n",
    "    n=5                         # compute n-grams up to 5-grams\n",
    ")\n",
    "\n",
    "compute_lda_model(\n",
    "    documents=train,\n",
    "    output_file=\"data/{}.lda.pickle.gz\".format(benchmark),\n",
    "    n_topics=500,               # number of topics\n",
    "    language='en',              # language of the input files\n",
    "    stoplist=list(punctuation), # stoplist (punctuation marks)\n",
    "    normalization='stemming'    # use porter stemmer\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7853dfdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pke import load_document_frequency_file, load_lda_model\n",
    "\n",
    "df = load_document_frequency_file(input_file='data/{}.df.gz'.format(benchmark))\n",
    "lda_model = load_lda_model(input_file=\"data/{}.lda.pickle.gz\".format(benchmark))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "be4f6ef4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3b94f01b04804c4ba7a31c4a89497a02",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "68606b2503c94f97a046d166843f9382",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c6c4d8899a2f480ab9cc4cab1724568f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bd411c7e82a1497d86832f218a8e66ae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c415bb66583e4f21ad1b802093be0ccb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe9a9f0ed8c842829dd520f49650fc14",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00885a3c1491407a81c131462da65e87",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a15ac0dc92ae40559206214bd7489827",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pke.unsupervised import *\n",
    "from timeit import default_timer as timer\n",
    "\n",
    "outputs = {}\n",
    "elapsed_times = {}\n",
    "#for model in [TopicalPageRank]:\n",
    "for model in [FirstPhrases, TextRank, SingleRank, TopicRank, PositionRank, MultipartiteRank, TfIdf, TopicalPageRank]:\n",
    "    outputs[model.__name__] = []\n",
    "    \n",
    "    extractor = model()\n",
    "    start = timer()\n",
    "    for i, doc in enumerate(tqdm(test)):\n",
    "        extractor.load_document(input=doc, language='en')\n",
    "        extractor.grammar_selection(grammar=\"NP: {<ADJ>*<NOUN|PROPN>+}\")\n",
    "        if model.__name__ in [\"TfIdf\"]:\n",
    "            extractor.candidate_weighting(df=df)\n",
    "        elif model.__name__ in [\"TopicalPageRank\"]:\n",
    "            extractor.candidate_weighting(lda_model=lda_model)\n",
    "        else:\n",
    "            extractor.candidate_weighting()\n",
    "        outputs[model.__name__].append([u for u,v in extractor.get_n_best(n=10, stemming=True)])\n",
    "    end = timer()\n",
    "    elapsed_times[model.__name__] = end - start\n",
    "    #print(\"Elapsed time for {}: {:.2f}s - {:.2f} it/s\".format(model.__name__, end - start,  len(test) / (end - start)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "1249a8c4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0aa66a232d204943864393fb80f51b4d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from nltk.stem.snowball import SnowballStemmer as Stemmer\n",
    "import numpy as np\n",
    "    \n",
    "# populates the references list with stemmed keyphrases\n",
    "references = []\n",
    "for sample in tqdm(dataset['test']):\n",
    "    sample_keyphrases = []\n",
    "    for keyphrase in sample[\"keyphrases\"]:\n",
    "        # tokenize keyphrase\n",
    "        tokens = [token.text for token in nlp(keyphrase)]\n",
    "        # normalize tokens using Porter's stemming\n",
    "        stems = [Stemmer('porter').stem(tok.lower()) for tok in tokens]\n",
    "        sample_keyphrases.append(\" \".join(stems))\n",
    "    references.append(sample_keyphrases)\n",
    "\n",
    "def evaluate(top_N_keyphrases, references, cutoff=5):\n",
    "    P = len(set(top_N_keyphrases[:cutoff]) & set(references)) / len(top_N_keyphrases[:cutoff])\n",
    "    R = len(set(top_N_keyphrases[:cutoff]) & set(references)) / len(references)\n",
    "    F = (2*P*R)/(P+R) if (P+R) > 0 else 0 \n",
    "    return (P, R, F)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "f2a7d4e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "## benchmarking on semeval-2010-pre\n",
      "| Model | it/s |  F@5 | F@10 |\n",
      "| :---- | ----:| ---: | ---: |\n",
      "| FirstPhrases  | 368.2 | 13.00 | 14.25 |\n",
      "| TextRank  | 299.6 | 8.85 | 12.97 |\n",
      "| SingleRank  | 288.0 | 11.11 | 16.23 |\n",
      "| TopicRank  | 190.3 | 11.18 | 13.81 |\n",
      "| PositionRank  | 280.4 | 12.72 | 16.64 |\n",
      "| MultipartiteRank  | 145.9 | 12.99 | 14.93 |\n",
      "| TfIdf  | 422.1 | 12.41 | 14.90 |\n",
      "| TopicalPageRank  | 37.1 | 11.40 | 16.04 |\n"
     ]
    }
   ],
   "source": [
    "print(\"## Benchmarking on {}\".format(benchmark))\n",
    "print(\"| Model | it/s |  F@5 | F@10 |\")\n",
    "print(\"| :---- | ----:| ---: | ---: |\")\n",
    "\n",
    "# loop through the models\n",
    "for model in outputs:\n",
    "    \n",
    "    f_scores = []\n",
    "    # compute the P, R, F scores for the model\n",
    "    for cutoff in [5, 10]:\n",
    "        scores = []\n",
    "        for i, output in enumerate(outputs[model]):\n",
    "            scores.append(evaluate(output, references[i], cutoff))\n",
    "\n",
    "        # compute the average scores\n",
    "        P, R, F = np.mean(scores, axis=0)\n",
    "        f_scores.append(F)\n",
    "        \n",
    "    print(\"| {}  | {:.1f} | {:.2f} | {:.2f} |\".format(model,  len(test)/ elapsed_times[model], f_scores[0]*100, f_scores[1]*100))\n",
    "\n",
    "\n",
    "        # print out the performance of the model\n",
    "        #print(\"{} at {} P: {:.3f} R: {:.3f} F: {:.3f}\".format(model, cutoff, avg_scores[0], avg_scores[1], avg_scores[2]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b841bd1d",
   "metadata": {},
   "source": [
    "| Model | F@5 | F@10 |\n",
    "| :---  | --: | ---: |\n",
    "| FirstPhrases  | 0.24 | 0.29 |\n",
    "| TextRank  | 0.27 | 0.34 |\n",
    "| SingleRank  | 0.27 | 0.34 |\n",
    "| TopicRank  | 0.25 | 0.28 |\n",
    "| PositionRank  | 0.28 | 0.33 |\n",
    "| MultipartiteRank  | 0.25 | 0.29 |\n",
    "| TfIdf  | 0.28 | 0.35 |\n",
    "| TopicalPageRank  | 0.28 | 0.34 |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51b18747",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
